rm(list=ls())
library(plyr)
library(lubridate)
library(dplyr)
library(ggplot2)
library(ROCR)
library(ggmap)
library(rpart)
library(curl)
library(jsonlite)
library(broom)
library(rgdal)
library(scales)

Read in data and graph injuries

#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)

#Filter out intersections w/o injuries
d.in <- d.in %>%
  filter(MN != 0)


#All Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=Injuries_COUNT)) + 
  geom_bar(stat="identity", fill="steelblue") + 
  labs(x = "Month", y = "Injuries") + 
  ggtitle("Monthly Injuries (2016)") 

plot(g)

#Pedestrian Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=PedInjurie_COUNT)) + 
  geom_bar(stat="identity", fill="steelblue")  + 
  labs(x = "Month", y = "Injuries") + 
  ggtitle("Pedestrian Injuries (2016)") 

plot(g)

#Bike Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=BikeInjuri_COUNT)) + 
  geom_bar(stat="identity", fill="steelblue") + 
  labs(x = "Month", y = "Injuries") + 
  ggtitle("Bike Injuries (2016)") 

plot(g) 

#Motor Vehicle Injuries
g <- ggplot(data = d.in, aes(x=month(MN,label=TRUE,abbr=TRUE), y=MVOInjurie_COUNT)) + 
  geom_bar(stat="identity", fill="steelblue") + 
  labs(x = "Month", y = "Injuries") + 
  ggtitle("Motor Vehicle Injuries (2016)") 

plot(g)

#Note: We had trouble grouping data by type and making a stacked bar chart, so did it in excel.

Plot injuries on a map

d.in <- read.csv('~/Desktop/data_analysis/project/visionzero/data/final/injury_monthly_final.csv', header = TRUE)
#d.in <- read.csv('~/Desktop/visionzero/data/final/injury_monthly_final.csv', header = TRUE)

# Set a base map
nyc <- get_map(location = c(lon = -73.935, lat = 40.712), zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.712,-73.935&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
#Bike injuries by month
bike_inj <- d.in %>% filter(Injuries == 1 & BikeInjuri == 1)

g <- ggmap(nyc) + geom_point(data = bike_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21)  + ggtitle("Bike Injuries (2016)") 

plot(g)
## Warning: Removed 51 rows containing missing values (geom_point).

#Pedestrian injuries by month
ped_inj <- d.in %>% filter(Injuries == 1 & PedInjurie == 1)

g <- ggmap(nyc) + geom_point(data = ped_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Pedestrian Injuries (2016)") 

plot(g)
## Warning: Removed 274 rows containing missing values (geom_point).

#MVO injuries by month
mvo_inj <- d.in %>% filter(Injuries == 1 & MVOInjurie == 1)

g <- ggmap(nyc) + geom_point(data = mvo_inj, aes(x = Lon, y = Lat, fill=month(MN,label=TRUE,abbr=TRUE)),size = 1, shape = 21) + ggtitle("Motor Vehicle Injuries (2016)") 

plot(g)
## Warning: Removed 908 rows containing missing values (geom_point).

#format dataset - turn variables into factors

d.in <- d.in %>%
  mutate(Injuries = as.factor(Injuries),
         PedInjurie = as.factor(PedInjurie),
         BikeInjuri = as.factor(BikeInjuri),
         MVOInjurie = as.factor(MVOInjurie),
         bike_priority_districts = as.factor(bike_priority_districts),
         enhanced_crossings = as.factor(enhanced_crossings),
         left_turn_traffic_calming = as.factor(left_turn_traffic_calming),
         neighborhood_slow_zones = as.factor(neighborhood_slow_zones), 
         leading_pedestrian_interval_signals = as.factor(leading_pedestrian_interval_signals), 
         signal_timing = as.factor(signal_timing), 
         speed_humps = as.factor(speed_humps), 
         safe_streets_for_seniors = as.factor(safe_streets_for_seniors),
         street_improvement_projects_corridors = as.factor(street_improvement_projects_corridors),
         vz_priority_corridors = as.factor(vz_priority_corridors),
         vz_priority_intersections = as.factor(vz_priority_intersections),
         arterial_slow_zones = as.factor(arterial_slow_zones), 
         MN = as.factor(MN),
         #MN = as.character.numeric_version(MN),
         street_improvement_projects_corridors = as.factor(street_improvement_projects_corridors),
         vz_priority_zones = as.factor(vz_priority_zones))

Multiple Logstic Regressions

injury_monthly.glm <- glm(Injuries ~ MN + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)
## Warning: glm.fit: algorithm did not converge
#summary(injury_yearly.glm)
#exp(coef(injury_yearly.glm))
#exp(confint(injury_yearly.glm))


logistic.injurymonthly.pedinjury <- glm(PedInjurie ~ MN + arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)

#summary(logistic.injuryyearly.pedinjury)

 
logistic.injurymonthly.bikeinjury <- glm(BikeInjuri ~ MN + bike_priority_districts + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)

#summary(logistic.injuryyearly.bikeinjury) 

logistic.injurymonthly.MVOinjurie <- glm(MVOInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.in)

Build Predictive Model

#GENERAL INJURY 
#split data 70-30

set.seed(123)  
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)

d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ] 

#train logistic regression take 1 (all)
m.log <- glm(Injuries ~ bike_priority_districts + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + vz_priority_corridors + vz_priority_intersections + vz_priority_zones, data = d.train, family = "binomial")

a <- data.frame(exp(confint(m.log)))
## Waiting for profiling to be done...
b <- data.frame(exp(coef(m.log)))



#train logistic regression take 2 (only real safety features)
#m.log <- glm(Injuries ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + neighborhood_slow_zones, data = d.train, family = "binomial")

#train logistic regression take 3 (only problem zones)
#m.log <- glm(Injuries ~ bike_priority_districts + vz_priority_corridors + vz_priority_intersections + vz_priority_zones, data = d.train, family = "binomial")

#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")

#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.2,
                                   1,
                                   0) 

# TP
tp <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()

# Sensitivity (or TPR)
sens <- tp/(tp+fn)

# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)

# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)

# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1500634
# FNR
fn/(fn+tp)
## [1] 0.4030172
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(roc.perf)

auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.7234597
#Decision Tree
mtree.1 <- rpart(Injuries ~ bike_priority_districts + enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + leading_pedestrian_interval_signals + signal_timing + safe_streets_for_seniors + street_improvement_projects_corridors + vz_priority_corridors + vz_priority_intersections + vz_priority_zones, data=d.train, method="class")
plot(mtree.1)
text(mtree.1, pretty = 1)

d.var_imp <-data.frame(mtree.1$variable.importance)
names(d.var_imp) <- "importance"
d.var_imp$variable <-as.factor(rownames(d.var_imp))
d.var_imp <-transform(d.var_imp, variable=reorder(variable, -importance) )

filt5 <- d.var_imp %>% top_n(-5)
## Selecting by variable
g <-ggplot(filt5,aes(x=variable, y=importance)) + 
  geom_bar(stat="identity")
plot(g) 

#PEDESTRIAN INJURY 
set.seed(123)  
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)

d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ] 

#train logistic regression 
m.log <- glm(PedInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + leading_pedestrian_interval_signals + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)


#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")

#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.05,
                                   1,
                                   0) 

# TP
tp <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()

# Sensitivity (or TPR)
sens <- tp/(tp+fn)

# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)

# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)

# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1222029
# FNR
fn/(fn+tp)
## [1] 0.5208333
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(roc.perf)

auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6784819
#BIKE INJURY 
set.seed(123)  
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)

d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ] 

#train logistic regression 
m.log <- glm(BikeInjuri ~ enhanced_crossings + speed_humps + left_turn_traffic_calming + arterial_slow_zones + signal_timing + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)


#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")

#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.03,
                                   1,
                                   0) 

# TP
tp <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()

# Sensitivity (or TPR)
sens <- tp/(tp+fn)

# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)

# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)

# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1069607
# FNR
fn/(fn+tp)
## [1] 0.5267002
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(roc.perf)

auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6831696
#MVO INJURY 
set.seed(123)  
t_index <- sample(seq_len(nrow(d.in)), size = floor(.70*nrow(d.in)), replace = F)

d.train <- d.in[t_index, ]
d.test <- d.in[-t_index, ] 

#train logistic regression 
m.log <- glm(MVOInjurie ~ arterial_slow_zones + enhanced_crossings + speed_humps + left_turn_traffic_calming + street_improvement_projects_corridors + neighborhood_slow_zones, family = "binomial", data = d.test)


#predict injury
d.test$Predicted_Injury <- predict(m.log, newdata = d.test, type = "response")

#optimal cutoff seems to be 0.1
d.test$Predicted_Injury <- ifelse(d.test$Predicted_Injury >= 0.1,
                                   1,
                                   0) 

# TP
tp <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 1) %>% nrow()
# TN
tn <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 0) %>% nrow()
# FP
fp <- d.test %>%
  filter(Injuries == 0 & Predicted_Injury == 1) %>% nrow()
# FN
fn <- d.test %>%
  filter(Injuries == 1 & Predicted_Injury == 0) %>% nrow()

# Sensitivity (or TPR)
sens <- tp/(tp+fn)

# Specificity ( 1 - FPR)
spec <- tn/(tn+fp)

# Accuracy
acc <- (fp+tp)/(tp+tn+fp+fn)

# FPR (1- Specificity)
1 - (tn/(tn+fp))
## [1] 0.1192252
# FNR
fn/(fn+tp)
## [1] 0.5763889
#find AUC
pred <- prediction(predictions = d.test$Predicted_Injury, labels = d.test$Injuries)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(roc.perf)

auc.perf <- performance(pred, measure = "auc")
auc.perf@y.values
## [[1]]
## [1] 0.6521929